Railtime Data Analysis

Comparison can be made with the official statistics: http://www.infrabel.be/nl/over-infrabel/stiptheid/rapporten/2015/2

Imports


In [ ]:
from railfetcher import *
from datetime import datetime, date, timedelta, time
from dateutil import parser
from mpl_toolkits.basemap import Basemap
import pymysql
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

Help functions


In [ ]:
def toUnix(datetime):
    unix = datetime.strftime('%s')
    return unix

def convertDatetime(unix):
    dt = datetime.fromtimestamp(unix)
    return dt

def convertDate(unix):
    d = date.fromtimestamp(unix)
    return d

Database functions


In [ ]:
class RailDatabase():
    def __init__(self, isNew):
        if isNew:
            self.conn = pymysql.connect(host='localhost', port=3306, 
                                        user='jnevens', passwd='XXX', db='newrailDB')
        else:
            self.conn = pymysql.connect(host='localhost', port=3306, 
                                        user='jnevens', passwd='XXX', db='oldrailDB')

    def getAllRoutes(self, date):
        C = self.conn.cursor()
        C.execute('SELECT route_id FROM route WHERE date = %s', (date,))
        rows = C.fetchall()
        C.close()
        return rows

    def getStops(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
        rows = C.fetchall()
        C.close()
        return rows

    def getLastStop(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', 
                  (routeID,))
        rows = C.fetchall()
        last = rows[-1:]
        C.close()
        return last

    def getStations(self):
        C = self.conn.cursor()
        C.execute('SELECT name_nl FROM station')
        rows = C.fetchall()
        C.close()
        return rows
    
    def getStationName(self, stationID):
        C = self.conn.cursor()
        C.execute('SELECT name_nl FROM station WHERE station_id = %s', (stationID,))
        row = C.fetchone()
        C.close()
        return row[0]
    
    def getStationLoc(self, nameNL):
        C = self.conn.cursor()
        C.execute('SELECT latitude, longitude FROM station WHERE name_nl = %s', (nameNL,))
        row = C.fetchone()
        C.close()
        return row
    
class Config():
    def __init__(self, isNew):
        self.new = isNew
        
    def period(self):
        if self.new:
            return (date(2014, 12, 15), date(2015, 3, 22))
        else:
            return (date(2014, 10, 27), date(2014, 12, 14))

Old Timetable

Average delay per day on the entire network


In [ ]:
conf = Config(False)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))

db = RailDatabase(False)

#DataFrame with worst-case and avg delay on entire network, per day
#Delay is computed in a worst-case scenario, i.e. max delay of a train
#divided by the amount of trains on that day.
#In avg-case scenario, i.e. avg of avg of arrival and avg of departure delay
#divided by the amount of trains on that day.
delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))

    #Instead of keeping a list of worst_delays {which is O(n) in memory}, 
    #the worst_delay is accumulated and 
    #divided by the amount of data points {which is O(1) in memory}
    total_worst_delay = 0
    worst_delay_count = 0
    
    total_avg_delay = 0
    avg_delay_count = 0
    
    for routeRow in routes:
        routeID = routeRow[0]
        stops = db.getStops(routeID)
        
        max_delay = 0
        
        total_arrival_delay = 0
        arrival_delay_count = 1
        total_departure_delay = 0
        departure_delay_count = 1
        
        for stopRow in stops:
            arrival_detected = stopRow[3]
            departure_detected = stopRow[6]
            arrival_delay = stopRow[2]
            departure_delay = stopRow[5]
            
            if arrival_detected:
                total_arrival_delay += arrival_delay
                arrival_delay_count += 1
                
            if departure_detected:
                total_departure_delay += departure_delay
                departure_delay_count += 1
            
            if max(arrival_delay, departure_delay) > max_delay:
                max_delay = max(arrival_delay, departure_delay)
        
        total_worst_delay += max_delay
        worst_delay_count += 1
        
        avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
        avg_departure = float(total_departure_delay) / float(departure_delay_count)
        
        total_avg_delay += np.mean([avg_arrival, avg_departure])
        avg_delay_count += 1
        
    key = start.isoformat()
    delays.loc[key, 'Worst case'] = float(total_worst_delay) / float(worst_delay_count)
    delays.loc[key, 'Avg case']   = float(total_avg_delay) / float(avg_delay_count)
        
    delta = timedelta(days=1)
    start = start + delta
    
delays

In [ ]:
delays.describe()

In [ ]:
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.title('Delays per day on entire network (old timetable)')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.savefig('./../../Paper/plots/old_delay_per_day.png')

Percentage of delayed trains per day on entire network


In [ ]:
conf = Config(False)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))

db = RailDatabase(False)

#DataFrame with the number of delayed trains per day
#A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
#The 2nd column is the percentage of delayed trains on that day.
perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))
    
    key = start.isoformat()
    count = len(routes)
    
    #Get all stops for the route
    for routeRow in routes:
        routeID = routeRow[0]
        stops = db.getStops(routeID)
        
        #If a stop is found in the route, with > 5min delay
        #then this train is considered delayed!
        for stopRow in stops:
            arrival_detected = stopRow[3]
            departure_detected = stopRow[6]
            arrival_delay = stopRow[2]
            departure_delay = stopRow[5]
            
            if arrival_detected:
                if arrival_delay > 5:
                    perctDelays.loc[key, 'Delayed'] += 1
                    break
            elif departure_detected:
                if departure_delay > 5:
                    perctDelays.loc[key, 'Delayed'] += 1
                    break
                    
    perctDelays.loc[key,'Percentage']=(float(perctDelays.loc[key,'Delayed'])/float(count))*100.
        
    delta = timedelta(days=1)
    start = start + delta
    
perctDelays

In [ ]:
nov = perctDelays[5:35]
nov.describe()

In [ ]:
perctDelays.describe()

In [ ]:
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='b')
plt.title('Percentage of delayed trains per day (old timetable)')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/old_percentage_delays.png')

Amount of cancelled trains per day


In [ ]:
conf = Config(False)
start, stop = conf.period()
period = pd.date_range(start, stop)

db = RailDatabase(False)

#Series with the amount of cancelled trains, per day
#A train is considered cancelled when arrival is not detected at its final stop
cancels = pd.Series(0, index = period)

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)
    key = start.isoformat()

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))
    
    for routeRow in routes:
        routeID = routeRow[0]
        lastStop = db.getLastStop(routeID)
        
        for stopRow in lastStop:
            arrival_detected = stopRow[3]
            if not arrival_detected:
                cancels[key] += 1
                
    delta = timedelta(days=1)
    start = start + delta

cancels

In [ ]:
nov = cancels[5:35]
nov.sum()

In [ ]:
cancels['2014-11-24'] = 0
cancels['2014-12-01'] = 0
cancels['2014-12-08'] = 0
cancels['2014-12-11'] = 0

In [ ]:
cancels.describe()

In [ ]:
fig = plt.figure()
plt.bar(cancels.index, cancels, color='b')
plt.title('Cancelled trains per day (old timetable) (no strike)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
plt.ylim(ymax=350)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/old_cancels_per_day_no_strike.png')

Delays per hour

Only weekdays are used!


In [ ]:
conf = Config(False)
period = np.arange(24)
sr = pd.Series(0, index=period)

start, stop = conf.period()

while start <= stop:
    weekday = start.weekday()
    if weekday < 5:
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)

        db = RailDatabase(False)
        routes = db.getAllRoutes(toUnix(dt))

        routeCount = 0
        tmp = pd.Series(0, index=period)

        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)

            includeRoute = False

            for stopRow in stops:
                arrival_datetime = stopRow[1]
                arrival_delay = stopRow[2]
                arrival_detected = stopRow[3]
                departure_datetime = stopRow[4]
                departure_delay = stopRow[5]
                departure_detected = stopRow[6]

                if arrival_detected:
                    if arrival_delay > 0:
                        key = convertDatetime(arrival_datetime)
                        key = key.time().hour
                        tmp[key] += arrival_delay
                        includeRoute = True
                elif departure_detected:
                    if departure_delay > 0:
                        key = convertDatetime(departure_datetime)
                        key = key.time().hour
                        tmp[key] += departure_delay
                        includeRoute = True

            if includeRoute:
                routeCount += 1

        if routeCount == 0:
            routeCount = 1
        calcAverage = lambda x: float(x) / float(routeCount)
        tmp = tmp.map(calcAverage)

        for key in period:
            a = sr[key]
            b = tmp[key]
            if a == 0:
                sr[key] = b
            elif b != 0:
                sr[key] = np.mean([a, b])

    delta = timedelta(days=1)
    start = start + delta
            
sr

In [ ]:
sr.describe()

In [ ]:
plt.figure()
rolling = pd.rolling_mean(sr, 3, center=True)
ax_delays = sr.plot(style='--', color='b')
rolling.plot(color='b', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Delays per hour on entire network (old timetable)')
plt.xlabel('Hour')
plt.ylabel('Minutes')
plt.ylim(ymax=10)
plt.savefig('./../../Paper/plots/old_delay_per_hour.png')

Map stations


In [ ]:
db = RailDatabase(False)
stations = db.getAllStations()

m = Basemap(resolution='i', projection='merc', 
    llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()
for stationRow in stations:
    x = stationRow[5]
    y = stationRow[6]
    lat, lon = m(y, x)
    m.plot(lat, lon, 'b.', alpha=0.5)
plt.title('Stations')
plt.savefig('./plots/stations.pdf')
plt.show()

New Timetable

Average delay per day on the entire network


In [ ]:
conf = Config(True)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))

db = RailDatabase(True)

#DataFrame with worst-case and avg delay on entire network, per day
#Delay is computed in a worst-case scenario, i.e. max delay of a train
#and in avg-case scenario, i.e. max of avg of arrival and avg of departure delay
delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))

    #Instead of keeping a list of worst_delays {which is O(n) in memory}, 
    #the worst_delay is accumulated and 
    #divided by the amount of data points {which is O(1) in memory}
    total_worst_delay = 0
    worst_delay_count = 0
    
    total_avg_delay = 0
    avg_delay_count = 0
    
    for routeRow in routes:
        routeID = routeRow[0]
        stops = db.getStops(routeID)
        
        max_delay = 0
        
        total_arrival_delay = 0
        arrival_delay_count = 1
        
        total_departure_delay = 0
        departure_delay_count = 1
        
        for stopRow in stops:
            arrival_detected = stopRow[3]
            departure_detected = stopRow[6]
            arrival_delay = stopRow[2]
            departure_delay = stopRow[5]
            
            if arrival_detected:
                total_arrival_delay += arrival_delay
                arrival_delay_count += 1
                
            if departure_detected:
                total_departure_delay += departure_delay
                departure_delay_count += 1
            
            if max(arrival_delay, departure_delay) > max_delay:
                max_delay = max(arrival_delay, departure_delay)
        
        total_worst_delay += max_delay
        worst_delay_count += 1
        
        avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
        avg_departure = float(total_departure_delay) / float(departure_delay_count)
        
        total_avg_delay += np.mean([avg_arrival, avg_departure])
        avg_delay_count += 1
        
    key = start.isoformat()
    delays.loc[key, 'Worst case'] = float(total_worst_delay) / float(worst_delay_count)
    delays.loc[key, 'Avg case']   = float(total_avg_delay) / float(avg_delay_count)
        
    delta = timedelta(days=1)
    start = start + delta
    
delays

In [ ]:
delays.describe()

In [ ]:
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.title('Delay on entire network (new timetable)')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.ylim(ymax=9)
plt.savefig('./../../Paper/plots/new_delay_per_day.png')

Percentage of delayed trains per day on entire network


In [ ]:
conf = Config(True)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))

db = RailDatabase(True)

#DataFrame with the number of delayed trains per day
#A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
#The 2nd column is the percentage of delayed trains on that day.
perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))
    
    key = start.isoformat()
    count = len(routes)
    
    #Get all stops for the route
    for routeRow in routes:
        routeID = routeRow[0]
        stops = db.getStops(routeID)
        
        #If a stop is found in the route, with > 5min delay
        #then this train is considered delayed!
        for stopRow in stops:
            arrival_detected = stopRow[3]
            departure_detected = stopRow[6]
            arrival_delay = stopRow[2]
            departure_delay = stopRow[5]
            
            if arrival_detected:
                if arrival_delay > 5:
                    perctDelays.loc[key, 'Delayed'] += 1
                    break
            elif departure_detected:
                if departure_delay > 5:
                    perctDelays.loc[key, 'Delayed'] += 1
                    break
                    
    perctDelays.loc[key,'Percentage']=(float(perctDelays.loc[key,'Delayed'])/float(count))*100.
        
    delta = timedelta(days=1)
    start = start + delta
    
perctDelays

In [ ]:
jan = perctDelays[17:48]
feb = perctDelays[49:76]
jan.describe()

In [ ]:
perctDelays.describe()

In [ ]:
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='r')
plt.title('Percentage of delayed trains per day (new timetable)')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_percentage_delays.png')

Amount of cancelled trains per day


In [ ]:
conf = Config(True)
start, stop = conf.period()
period = pd.date_range(start, stop)

db = RailDatabase(True)

#Series with the amount of cancelled trains, per day
#A train is considered cancelled when arrival is not detected at its final stop
cancels = pd.Series(0, index = period)

while(start <= stop):
    t = time(0, 0, 0)
    dt = datetime.combine(start, t)
    key = start.isoformat()

    #Get all routes for a specified date.
    #Every trainID rides only once each day, 
    #so there is no point in asking all the trainIDs first
    routes = db.getAllRoutes(toUnix(dt))
    
    for routeRow in routes:
        routeID = routeRow[0]
        lastStop = db.getLastStop(routeID)
        
        for stopRow in lastStop:
            arrival_detected = stopRow[3]
            if not arrival_detected:
                cancels[key] += 1
                
    delta = timedelta(days=1)
    start = start + delta

cancels

In [ ]:
cancels['2014-12-15'] = 0

In [ ]:
jan = cancels[17:17+31]
jan.sum()

In [ ]:
cancels.describe()

In [ ]:
fig = plt.figure()
plt.bar(cancels.index, cancels, color='r')
plt.title('Cancelled trains per day (new timetable)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_cancels_per_day.png')

In [ ]:
c = cancels[1:]
fig = plt.figure()
plt.bar(c.index, c, color='r')
plt.title('Cancelled trains per day (new timetable) (no strike)')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_cancels_per_day_no_strike.png')

Delays per hour

Only weekdays are used for this!


In [ ]:
conf = Config(True)
period = np.arange(24)
sr = pd.Series(0, index=period)

start = date(2014, 12, 15)
stop = date(2015, 3, 22)

while start <= stop:
    weekday = start.weekday()
    if weekday < 5:
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)

        db = RailDatabase(True)
        routes = db.getAllRoutes(toUnix(dt))

        routeCount = 0
        tmp = pd.Series(0, index=period)

        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)

            includeRoute = False

            for stopRow in stops:
                arrival_datetime = stopRow[1]
                arrival_delay = stopRow[2]
                arrival_detected = stopRow[3]
                departure_datetime = stopRow[4]
                departure_delay = stopRow[5]
                departure_detected = stopRow[6]

                if arrival_detected:
                    if arrival_delay > 0:
                        key = convertDatetime(arrival_datetime)
                        key = key.time().hour
                        tmp[key] += arrival_delay
                        includeRoute = True
                elif departure_detected:
                    if departure_delay > 0:
                        key = convertDatetime(departure_datetime)
                        key = key.time().hour
                        tmp[key] += departure_delay
                        includeRoute = True

            if includeRoute:
                routeCount += 1

        if routeCount == 0:
            routeCount = 1
        calcAverage = lambda x: float(x) / float(routeCount)
        tmp = tmp.map(calcAverage)

        for key in period:
            a = sr[key]
            b = tmp[key]
            if a == 0:
                sr[key] = b
            elif b != 0:
                sr[key] = np.mean([a, b])

    delta = timedelta(days=1)
    start = start + delta
            
sr

In [ ]:
sr.describe()

In [ ]:
plt.figure()
rolling = pd.rolling_mean(sr, 3, center=True)
ax_delays = sr.plot(style='--', color='r')
rolling.plot(color='r', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Delays per hour on entire network (new timetable)')
plt.xlabel('Hour')
plt.ylabel('Minutes')
plt.ylim(ymax=10)
plt.savefig('./../../Paper/plots/new_delay_per_hour.png')

In [ ]: